image: nvcr.io/nvidia/pytorch:23.04-py3

stages:
  - test
  - jet
  - cleanup

variables: &VARS
  SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron"
  DATA_DIR: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data"
  PYTORCH_IMAGE: /lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/nvcr_pytorch_23.04.sqsh # This is the image that is run by all nodes on selene for tests
  PYTHON_VIRTUAL_ENV: /lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate
  TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED: "MR_TESTS JET" # Can specify levels
  TESTS_TO_RUN_AFTER_MERGING: "MR_TESTS NIGHTLY_TESTS" # Can specify levels
  TESTS_TO_RUN_ON_THIS_COMMIT: unit_tests
  TEST_REGEX_ON_THIS_COMMIT:  NONE #https://github.com/google/re2/wiki/Syntax (Can define regex as in this spec) e.g /.*gpt3.*/
  JET_CUSTOM_FILTER: ""
  DISPLAY_OUTPUT: "True" # Set to true for new tests to copy the logs for creating golden truth file
  TIME_LIMIT: "10:00" # Default time limit for all jobs
  MOE_GROUPED_GEMM: 0 # Set to 1 to enable grouped gemm for MoE


include:
  - jet-tests.yml

unit_tests:
  image: nvcr.io/nvidia/pytorch:23.04-py3
  tags:
    - docker_local_runner
  stage: test
  script:
    - pip install pytest-cov
    - pip install pytest_mock
    - pip install nltk
    - pip install wrapt
    - pip install zarr "tensorstore==0.1.45"  # for distributed checkpointing tests
    - pip install git+https://github.com/fanshiqing/grouped_gemm@main  # for grouped gemm tests
    - torchrun --nproc_per_node=8 -m pytest --cov-report=term --cov-report=html --cov=megatron/core tests/unit_tests
  coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
  artifacts:
    paths:
      - coverage
    expire_in: 30 days
  rules:
    - when: always

docs_build_test:
  stage: test
  tags:
    - docker_local_runner
  script:
    - cd ..
    - rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@gitlab-master.nvidia.com/nemo-megatron-core-tme/documentation.git
    - mv megatron-lm/ documentation/
    - cd documentation/
    - ./repo docs
  allow_failure: true
  except:
    - main

formatting:
  image: nvcr.io/nvidia/pytorch:23.04-py3
  tags:
    - docker_local_runner
  stage: test
  script:
    - pip install --upgrade black==19.10b0 isort click==8.0.2
    - black megatron/core --check --verbose --diff
    - isort megatron/core --check
  rules:
    - when: always

.selene_test_launcher: &selene-test-launcher
  tags:
    - ssh_selene_runner
  stage: test
  script: &selene-test-launcher-script
    - echo "Running selene test"
    - pwd
    - run_cmd="bash tests/functional_tests/shell_test_utils/run_selene_test_launcher_script.sh RUN_MODEL=$RUN_MODEL TP_SIZE=$TP_SIZE PP_SIZE=$PP_SIZE VP_SIZE=$VP_SIZE NUM_NODES=$NUM_NODES SELENE_ADLR_CI_PATH=$SELENE_ADLR_CI_PATH CI_PIPELINE_ID=$CI_PIPELINE_ID RUN_NAME=$RUN_NAME MAX_STEPS=$MAX_STEPS PYTORCH_IMAGE=$PYTORCH_IMAGE DATA_DIR=$DATA_DIR USE_CORE=$USE_CORE USE_TE=$USE_TE MOE_GROUPED_GEMM=$MOE_GROUPED_GEMM TIME_LIMIT=$TIME_LIMIT"
    - echo "$run_cmd"
    - ${run_cmd}
    - echo "Completed the job"
  rules:
    - if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
      when: always
    - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
      when: always
    - if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_APPROVED && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
      when: always
    - if: $CI_PIPELINE_SOURCE == "merge_request_event" && $CI_MERGE_REQUEST_LABELS =~ "READY FOR REVIEW" && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGE_REQ_APPROVED
      when: always
  allow_failure: false
  retry: 2

train.bert_core.345m_tp1_pp2_1node_50steps_rope:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 1
    PP_SIZE: 2
    NUM_NODES: 1
    USE_CORE: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0
    METADATA: rope_embeddings
    ADDITIONAL_PARAMS: "--position-embedding-type rope"

train.bert_core.345m_tp1_pp2_1node_50steps_sequence_parallel:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 1
    PP_SIZE: 2
    NUM_NODES: 1
    USE_CORE: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0
    METADATA: sequence_parallel
    ADDITIONAL_PARAMS: "--sequence-parallel"

train.retro_core.tp1_pp1_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: retro
    USE_TE: 0
    USE_CORE: 1
    TP_SIZE: 1
    PP_SIZE: 1
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: MONTHLY_TESTS
